#!/usr/bin/env python
# coding: utf-8

# In[8]:


#tree slicing 
from ete3 import Tree
# Load the Newick tree. A raw string is used so the backslashes of the Windows
# path stay literal instead of being parsed as (invalid) escape sequences; the
# resulting path text is unchanged.
t = Tree(r'E:\MSc Applied Biotechnology\Introduction literature\Project_master_thesis/All_HA_march2020_cls95.fasta.final_tree.nw')
#print (t)
h = 6.2  # the height value at which the tree will be trimmed

first_node = t.get_tree_root()

# Names of every node that carries a non-empty name.
all_final_leaf = [nod.name for nod in t.traverse('preorder') if len(nod.name) > 0]

# Pick the first node on each root-to-leaf path whose distance from the root
# exceeds h. Preorder visits ancestors before descendants, so a node is skipped
# when one of its ancestors has already been picked.
key_nodes = []
picked = set()  # same nodes as key_nodes, kept for constant-time membership tests
for nod in t.traverse('preorder'):
    distance = t.get_distance(nod, first_node)
    if distance > h and not any(anc in picked for anc in nod.get_ancestors()):
        key_nodes.append(nod)
        picked.add(nod)

# One list of leaf names per picked node (a picked leaf forms its own group).
key_nodes_names = [
    [str(nod.name)] if nod.is_leaf() else [str(x.name) for x in nod.get_leaves()]
    for nod in key_nodes
]

flatten_list = [y for x in key_nodes_names for y in x if ',' not in y]
covered = set(flatten_list)

# Every named node not covered by a picked subtree becomes a singleton group.
for leaf in all_final_leaf:
    if leaf not in covered:
        key_nodes_names.append([leaf])

print(len(key_nodes_names))
# One group per line, names separated by commas.
# NOTE(review): later code reads 'C:/Users/abo_a/HA_100.fasta', not this
# 'HA_100.fa' -- confirm which file is intended.
with open('HA_100.fa', 'w') as f:
    for element in key_nodes_names:
        f.write(",".join(map(str, element)) + "\n")


# In[9]:


# Load the comma-separated file: every line becomes a list of its fields.
# str.split(',') already returns a one-element list when the line has no comma,
# so no special case is needed. The context manager closes the file afterwards.
# NOTE(review): presumably this is the group file produced by the tree-slicing
# step -- confirm the path.
with open('C:/Users/abo_a/HA_100.fasta', 'r') as file_x:
    first_list = [row.rstrip().split(',') for row in file_x]


# In[27]:


#print(first_list)


# In[10]:


# Collect (and print) every line of the sequence file that contains ">",
# i.e. the FASTA header lines. The raw string keeps the backslashes of the
# Windows path literal; the context manager closes the file afterwards.
# The former "i <= 2" guard was always true (i was never incremented), so it
# is dropped without changing which rows are collected.
second_list = []
with open(r'E:\MSc Applied Biotechnology\Introduction literature\Project_master_thesis\HA_all_sequences.fasta', 'r') as a:
    for row in a:
        if ">" in row:
            print(row)
            second_list.append(row)


# In[11]:


# Build a lookup from each header line: the key is the text after ">" in the
# first "|"-separated field, the value is the text after the first ":" in the
# ninth "|"-separated field.
# NOTE(review): header rows are not stripped, so if the ninth field is the last
# one on the line the value keeps its trailing newline -- confirm.
my_dic = {}
for header in second_list:
    fields = header.split('|')
    gb_name = fields[0].split('>')[1]
    subtype = fields[8].split(':')[1]
    my_dic[gb_name] = subtype


# In[12]:


# Translate every group from gb names to HA subtypes: "_" in a name is replaced
# by ":" to obtain the key used in my_dic.
subtree_subtypes = [
    [my_dic[name.replace("_", ":")] for name in members]
    for members in first_list
]


# In[13]:


# Write one line per group: its subtypes joined by commas.
# NOTE(review): an earlier step reads 'C:/Users/abo_a/HA_100.fasta'; if the
# working directory is that folder, this overwrites that input -- confirm.
with open('HA_100.fasta', 'w') as f:
    f.writelines(','.join(map(str, subtree)) + "\n" for subtree in subtree_subtypes)


# In[8]:


# Scratch check of the "_" -> ":" name conversion; as a plain script statement
# the result is discarded.
'gb_CY076165'.replace("_",":")


# In[ ]:





# In[25]:


# gb names (underscore form) of the records to export as individual FASTA files
ala = ["gb_CY006032", "gb_KY635719"]


# In[26]:


# Create one FASTA file per selected record: for every header whose key
# (first "|" field after ">", with ":" replaced by "_") is listed in `ala`,
# write "<key>.fasta" containing a ">key" header followed by the record's
# sequence lines.
# The output handle is tracked explicitly so the code no longer depends on an
# `f` left over from an earlier cell, and both files are closed even on error.
out_file = None
try:
    # Raw string keeps the backslashes of the Windows path literal.
    with open(r'Downloads\Project_master_thesis\HA_all_sequences.fasta', 'r') as a:
        for row in a:
            if ">" in row:
                # A new record starts: finish the previous output file, if any.
                if out_file is not None:
                    out_file.close()
                    out_file = None
                key = row.split('|')[0].split('>')[1].replace(":", "_")
                if key in ala:
                    out_file = open(key + '.fasta', 'w')
                    out_file.write('>' + key + '\n')
            elif out_file is not None:
                # Sequence line belonging to a selected record.
                out_file.write(str(row))
finally:
    if out_file is not None:
        out_file.close()


# In[ ]:





# In[10]:


# Sample numbers used to try out avg()
q = [
    8,
    7,
    3,
    4,
    5,
]


# In[11]:


def avg(num):
    """Return the arithmetic mean of the numbers in *num*.

    Args:
        num: A non-empty sized collection of numbers (``len(num)`` is used).

    Returns:
        The sum of the elements divided by their count (true division).

    Raises:
        ZeroDivisionError: If *num* is empty.
    """
    # Built-in sum replaces the manual accumulation; no local named "avg"
    # shadows the function any more.
    return sum(num) / len(num)

# Try avg() on the sample list q and print the result.
print(avg(q))


# In[ ]:




